rm(list=ls())
##set the data path
setwd("E:/data/cooperation/public service/data1")
##load the library----
library(foreign)
library(xlsxjars)
library(xlsx)
library(ggplot2)
library(gridExtra)
library(randomForest)
library(readstata13)
library(stringr)
library(moments)
###load the data----
da<-read.dta13("public.dta")
dim(da)
name<-colnames(da)
a<-c(which(name=="year"),which(name=="countrycode"),which(name=="regioncode"))
da_new<-da[,-a]
num_test<-which(is.na(da_new$ffp_ps))
num_train<-(1:2258)[-num_test]
da_test<-da_new[num_test,]
da_train<-da_new[num_train,]
##Mean imputation----
set.seed(100000)
country<-names(table(da$countrycode))
for (i in 1:length(country))
{
  num<-which(da$countrycode==country[i])
  da_1<-da[num,]
  num1<-which(is.na(da_1$ffp_ps))
  d<-sd(da_1$ffp_ps,na.rm=T)
  a<-rnorm(length(num1),0,d)
  print(mean(da_1$ffp_ps,na.rm=T))
  da_1$ffp_ps[num1]<-mean(da_1$ffp_ps,na.rm=T)+a
  
  da[num,]<-da_1
}

write.dta(da,file="da_mean.dta")##simple data mining
summary(da$ffp_ps)
###Multiple imputation------
library(mice)
set.seed(100000)
da<-read.dta13("public.dta")
re<-mice(da, meth=c('pmm'))
da$ffp_ps[which(is.na(da$ffp_ps))]<-re$imp$ffp_ps[,1]

write.dta(da,file="da_EMB.dta")

##Random forest interpolation----
set.seed(100000)
result<-randomForest(ffp_ps~.,data=da_train,importance=TRUE, na.action=na.omit,ntree=1000)
da_1<-predict(result,da_test)
#da_1<-predict(result,da_test)
da$ffp_ps[num_test]<-da_1   #new data after data mining
da$ffp_ps<-as.numeric(da$ffp_ps)
da_base<-da
write.dta(da_base,file="da_random.dta")##simple data mining



##Country screening-----
a<-table(da$countrycode)
countryname<-names(a)
write.csv(countryname,file="country.csv")

##Statistical Analysis-----
da<-read.dta13("original.dta")
na<-names(table(da$countrycode))
na2<-colnames(da)
na1<-c("ffp_ps","wdi_internetuse","yr_sch_2","lpop_dnst","fh_ipolity2","hf_efiscore", 
  "wbgi_pse","wbgi_gee","ciri_polpris","lgdppc","wdi_telephone","lpat_uspt")
n<-match(na1,na2)
da_na<-da[,n]



##Mean imputed distribution----
da_m<-read.dta13("da_mean.dta")
da_m1<-da_m[num_test,]
q2<-ggplot(da_m1, aes(x=ffp_ps)) +
  geom_histogram(aes(y = ..density..),
                 bins = 40,
                 fill = "white",
                 color = "black") +
  stat_density(
    geom = 'line',
    position = 'identity',
    size = 1,
    color = "gray10"
  ) +
  theme_bw() +
  ylim(0,0.03)+
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title.x=element_text(hjust = 0.5, size = 7),
    axis.title.y=element_text(hjust = 0.5, size = 7),
    plot.title = element_text(hjust = 0.5, size = 7),
    axis.line = element_line()
  ) +
  labs(x="Random imputation",y=NULL,title="(b)")
##Distribution of raw data----
da_o<-read.dta13("original.dta")
da_o$ffp_ps<-da_o$ffp_ps*10
q1<-ggplot(da_o, aes(x=ffp_ps)) +
  geom_histogram(aes(y = ..density..),
                 bins = 40,
                 fill = "white",
                 color = "black") +
  stat_density(
    geom = 'line',
    position = 'identity',
    size = 1,
    color = "gray10"
  ) +
  theme_bw() +
  ylim(0,0.03)+
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title.x=element_text(hjust = 0.5, size = 7),
    axis.title.y=element_text(hjust = 0.5, size = 7),
    plot.title = element_text(hjust = 0.5, size = 7),
    axis.line = element_line()
  ) +
  labs(x="Original data",y="Probability density",title="(a)")
  
##Distribution of multiple imputation----
da_e<-read.dta13("da_EMB.dta")
da_e1<-da_e[num_test,]

q3<-ggplot(da_e1, aes(x=ffp_ps)) +
  geom_histogram(aes(y = ..density..),
                 bins = 40,
                 fill = "white",
                 color = "black") +
  stat_density(
    geom = 'line',
    position = 'identity',
    size = 1,
    color = "gray10"
  ) +
  theme_bw() +
  ylim(0,0.03)+
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title.x=element_text(hjust = 0.5, size = 7),
    axis.title.y=element_text(hjust = 0.5, size = 7),
    plot.title = element_text(hjust = 0.5, size = 7),
    axis.line = element_line()
  ) +
  labs(x="Multiple imputation",y="Probability density",title="(c)")
##Distribution of Big Data Imputation----
da_b<-read.dta13("da_random.dta")
da_b1<-da_b[num_test,]
q4<-ggplot(da_b1, aes(x=ffp_ps)) +
  geom_histogram(aes(y = ..density..),
                 bins = 40,
                 fill = "white",
                 color = "black") +
  stat_density(
    geom = 'line',
    position = 'identity',
    size = 1,
    color = "gray10"
  ) +
  ylim(0,0.03)+
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title.x=element_text(hjust = 0.5, size = 7),
    axis.title.y=element_text(hjust = 0.5, size = 7),
    plot.title = element_text(hjust = 0.5, size = 7),
    axis.line = element_line()
  ) +
  labs(x="Machine learning imputation",y=NULL,title="(d)")
grid.arrange(q1,q2,q3,q4,ncol=2)



##Missing distribution----
da<-read.dta13("original.dta")
names(table(da$countrycode))#116 countries
da<-data.frame(id=da$countrycode,ffp_ps=da$ffp_ps)
da2<-split(da,da$id)  #Split data by country
country<-labels(da2)
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(country))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-country
re<-t(re)
write.csv(re,file="balance_country.csv")
re<-re[,2]
re<-as.data.frame(re)
q1<-ggplot(re,aes(x=re))+ 
  geom_histogram(aes(y = ..density..),
                 bins = 8,
                 fill = "white",
                 color = "black")+
  theme_bw()+
  labs(y=NULL,size=1,x="Ratio of missing data",title="(a)")+
  theme(axis.text.x = element_text(family="Arial", size=8))+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())

da<-read.dta13("original.dta")
names(table(da$countrycode))
da<-data.frame(id=da$year,ffp_ps=da$ffp_ps)
da2<-split(da,da$id) 
country<-labels(da2)
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(country))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-country
re<-t(re)
write.csv(re,file="balance_year.csv")
re<-as.data.frame(re)
re$id=rownames(re)
re$id<-as.numeric(re$id)
q2<-ggplot(re,aes(id,ffp_ps))+  
  geom_bar(stat='identity',
           width=0.5,
           fill = "gray",
           color = "black")+
  geom_point(cex=0.4)+
  geom_line(linetype="dotted",cex=1)+
  theme_bw()+
  xlim(1995,2014)+
  labs(y="Ratio of missing data",size=0.1,x="Year",title="(b)")+
  theme(axis.text.x = element_text(family="Arial", size=8))+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())

grid.arrange(q1,q2,ncol=2)

##The importance of variables----- 
da<-read.dta13("original.dta")
da<-da[,4:32]
result<-randomForest(ffp_ps~.,data=da,importance=TRUE, na.action=na.omit,ntree=1000)
im<-importance(result)
write.csv(im,file="im.csv")

##Statistical description----
da<-read.dta13("original.dta")
da1<-read.dta13("da_mean.dta")
da2<-read.dta13("da_random.dta")
da5<-read.dta13("da_EMB.dta")
da5$original<-0
da5$original[num_test]<-1
da1$original<-0
da1$original[num_test]<-1
da2$original<-0
da2$original[num_test]<-1
da$ffp_ps=(1-(da$ffp_ps-1)/9)*100

length(da1$ffp_ps)-sum(is.na(da1$ffp_ps))
mean(da1$ffp_ps)
sd(da1$ffp_ps)
range(da1$ffp_ps)
skewness(da1$ffp_ps)
kurtosis(da1$ffp_ps)

length(da$ffp_ps)-sum(is.na(da$ffp_ps))
mean(da$ffp_ps,na.rm=T)
sd(da$ffp_ps,na.rm=T)
range(da$ffp_ps,na.rm=T)
skewness(da$ffp_ps,na.rm=T)
kurtosis(da$ffp_ps,na.rm=T)

length(da2$ffp_ps)-sum(is.na(da2$ffp_ps))
mean(da2$ffp_ps,na.rm=T)
sd(da2$ffp_ps,na.rm=T)
range(da2$ffp_ps,na.rm=T)
skewness(da2$ffp_ps,na.rm=T)
kurtosis(da2$ffp_ps,na.rm=T)

length(da5$ffp_ps)-sum(is.na(da5$ffp_ps))
mean(da5$ffp_ps,na.rm=T)
sd(da5$ffp_ps,na.rm=T)
range(da5$ffp_ps,na.rm=T)
skewness(da5$ffp_ps,na.rm=T)
kurtosis(da5$ffp_ps,na.rm=T)


da3<-da1[which(da1$original==1),]
da4<-da2[which(da2$original==1),]
da6<-da5[which(da5$original==1),]

length(da3$ffp_ps)-sum(is.na(da3$ffp_ps))
mean(da3$ffp_ps,na.rm=T)
sd(da3$ffp_ps,na.rm=T)
range(da3$ffp_ps,na.rm=T)
skewness(da3$ffp_ps,na.rm=T)
kurtosis(da3$ffp_ps,na.rm=T)


length(da4$ffp_ps)-sum(is.na(da4$ffp_ps))
mean(da4$ffp_ps,na.rm=T)
sd(da4$ffp_ps,na.rm=T)
range(da4$ffp_ps,na.rm=T)
skewness(da4$ffp_ps,na.rm=T)
kurtosis(da4$ffp_ps,na.rm=T)

length(da6$ffp_ps)-sum(is.na(da6$ffp_ps))
mean(da6$ffp_ps,na.rm=T)
sd(da6$ffp_ps,na.rm=T)
range(da6$ffp_ps,na.rm=T)
skewness(da6$ffp_ps,na.rm=T)
kurtosis(da6$ffp_ps,na.rm=T)



##Graphical analysis results----
library(foreign)
library(VGAM)
a<-read.csv("a.csv",header = T,sep = ",")
a<-as.data.frame(a)
b<-read.csv("color1.csv",header = T)
k<-seq(1,12,3)
#levels(a[,1])<-c("Internet users—FE","R-Square—FE","Internet users—IV","R-Square—IV" )
dev.copy(postscript, file="Fig 1.eps", height=6, width=6, horizontal=F, onefile=F)
op<-par(mfrow=c(2,2),mai=c(0.3,0.8,0.3,0.4),oma=c(2,3,2,0.1),las=1)
boxplot(model.1~num,data=a,main = "Original",cex.axis=0.8,font.main=2,names=a$label[k],horizontal = TRUE, pch = 2,pars = list(boxwex = 0.4, staplewex = 0.4, outwex = 0.3),col=gray(b$model.1),ylim=c(-0.3,0.8),ylab = NULL)
abline(v=0,col="red")
boxplot(model.2~num,data=a,main = "random mean",cex.axis=0.8,names=a$label[k],horizontal = TRUE, pch = 2,pars = list(boxwex = 0.4, staplewex = 0.4, outwex = 0.3),col=gray(b$model.3),ylim=c(-0.6,0.8),ylab = NULL)
abline(v=0,col="red")
boxplot(model.3~num,data=a,main = "multiple",cex.axis=0.8,names=a$label[k],horizontal = TRUE, pch = 2,pars = list(boxwex = 0.4, staplewex = 0.4, outwex = 0.3),col=gray(b$model.2),ylim=c(-0.3,0.8),ylab = NULL)
abline(v=0,col="red")
boxplot(model.4~num,data=a,main = "machine learning",cex.axis=0.8,names=a$label[k],horizontal = TRUE, pch = 2,pars = list(boxwex = 0.4, staplewex = 0.4, outwex = 0.3),col=gray(b$model.4),ylim=c(-0.6,0.8),ylab = NULL)
abline(v=0,col="red")
dev.off()


